Import and Explore Data

Import the diagnostic data into R

# Load the raw diagnosis table, then keep a second copy in which every column
# is stored as plain character strings so codes can be rewritten textually.
MyData <- read.csv("diag.csv", header = TRUE, sep = ",")
MyData_str <- MyData
MyData_str[] <- lapply(MyData, as.character)
str(MyData_str)
## 'data.frame':    101766 obs. of  4 variables:
##  $ patient_nbr: chr  "8222157" "55629189" "86047875" "82442376" ...
##  $ diag_1     : chr  "250.83" "276" "648" "8" ...
##  $ diag_2     : chr  "?" "250.01" "250" "250.43" ...
##  $ diag_3     : chr  "?" "255" "V27" "403" ...
head(MyData)
##   patient_nbr diag_1 diag_2 diag_3
## 1     8222157 250.83      ?      ?
## 2    55629189    276 250.01    255
## 3    86047875    648    250    V27
## 4    82442376      8 250.43    403
## 5    42519267    197    157    250
## 6    82637451    414    411    250

Gather all the diagnostic codes into one vector

# Stack the three diagnosis columns (diag_1, then diag_2, then diag_3) into a
# single character vector.
diagColumns <- MyData[c("diag_1", "diag_2", "diag_3")]
diags <- unlist(lapply(diagColumns, as.character), use.names = FALSE)
head(diags)
## [1] "250.83" "276"    "648"    "8"      "197"    "414"
print(paste("The length of the vector: ",length(diags)))
## [1] "The length of the vector:  305298"

Get the number of the unique diagnostic codes

UniqueDiags<-unique(diags)
print(paste("The number of unique codes: ",length(UniqueDiags)))
## [1] "The number of unique codes:  916"

Combine the diagnostic codes of diag_1, diag_2 and diag_3 into a single value per row (a diagnostic set) to find how many combinations of codes the dataset contains

# Build one "diag_1|diag_2|diag_3" key per row.
# paste() is vectorised, so no loop is needed. This also removes the 0-based
# loop start (R vectors are 1-based, so index 0 was a no-op), the
# element-by-element growth of a vector seeded with the number 0, and the
# spurious "0" result the loop produced for a table with no rows.
diagSets <- paste(MyData$diag_1, MyData$diag_2, MyData$diag_3, sep = "|")
head(diagSets, 10)
##  [1] "250.83|?|?"     "276|250.01|255" "648|250|V27"    "8|250.43|403"  
##  [5] "197|157|250"    "414|411|250"    "414|411|V45"    "428|492|250"   
##  [9] "398|427|38"     "434|198|486"
print(paste("The number of combinations: ",length(diagSets)))
## [1] "The number of combinations:  101766"

Get the number of unique combination sets of diagnostic codes

unique_diagSets<-unique(diagSets)
print(paste("The number of unique combinations: ",length(unique_diagSets)))
## [1] "The number of unique combinations:  58166"

Find the letters that the diagnostic codes start with

# Print every capital letter that is the first character of at least one
# diagnostic code.
# The original chained assignment also overwrote `diagE_unique` on every
# iteration as a side effect; nothing is assigned here. `na.rm = TRUE` keeps a
# missing code from being counted as a match for every letter.
for (letter in LETTERS) {
  if (any(startsWith(UniqueDiags, letter), na.rm = TRUE)) {
    print(letter)
  }
}
## [1] "E"
## [1] "V"

Transform Data

Get all codes that start with V letter

diagV_unique<-UniqueDiags[startsWith(UniqueDiags, "V")]
diagV_unique
##  [1] "V57" "V58" "V55" "V53" "V45" "V66" "V56" "V26" "V71" "V54" "V67"
## [12] "V60" "V43" "V63" "V25" "V70" "V07" "V51" "V15" "V10" "V42" "V44"
## [23] "V65" "V12" "V23" "V17" "V72" "V49" "V18" "V14" "V46" "V64" "V61"
## [34] "V08" "V62" "V09" "V11" "V16" "V13" "V85" "V02" "V50" "V03" "V69"
## [45] "V86" "V27" "V22" "V01" "V06"
cat("\n")
print(paste("The number of unique values: ",length(diagV_unique)))
## [1] "The number of unique values:  49"

Matched: V08

Matched only if you add .01: V53,V71,V54,V25,V15,V61

Add 00 to the end of the code: V45,V67,V10,V12,V64,V13

Add 0 to the rest of the code

Modify the V-codes to match the Find-A-Code format

# V-code groups from the manual Find-A-Code comparison:
#   diagNot      - only match with ".01" added; left unchanged and recorded
#                  as non-matching
#   diag00       - need "00" appended
#   diagNochange - already match
diagNot <- c("V53", "V71", "V54", "V25", "V15", "V61")
diag00 <- c("V45", "V67", "V10", "V12", "V64", "V13")
diagNochange <- c("V08")

diagNot0 <- c(diagNot, diag00, diagNochange)

# Every other V-code gets a single "0" appended.
diag0 <- diagV_unique[!(diagV_unique %in% diagNot0)]

# Rewrite the diagnosis columns only, one vectorised pass per column, instead
# of comparing the whole data frame (patient_nbr included) against each code
# in turn.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  add0 <- MyData_str[[diagCol]] %in% diag0
  MyData_str[[diagCol]][add0] <- paste0(MyData_str[[diagCol]][add0], "0")

  add00 <- MyData_str[[diagCol]] %in% diag00
  MyData_str[[diagCol]][add00] <- paste0(MyData_str[[diagCol]][add00], "00")
}

nonMatchingCodes <- diagNot

# test the codes

unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "V")])
##  [1] "V570"  "V580"  "V550"  "V53"   "V4500" "V660"  "V560"  "V260" 
##  [9] "V71"   "V54"   "V6700" "V600"  "V430"  "V630"  "V25"   "V700" 
## [17] "V070"  "V510"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "V")])
##  [1] "V4500" "V15"   "V1000" "V420"  "V440"  "V650"  "V1200" "V570" 
##  [9] "V430"  "V230"  "V170"  "V720"  "V580"  "V490"  "V180"  "V140" 
## [17] "V660"  "V460"  "V6400" "V61"   "V08"   "V53"   "V620"  "V090" 
## [25] "V54"   "V110"  "V160"  "V700"  "V1300" "V630"  "V850"  "V020" 
## [33] "V25"   "V500"  "V030"  "V690"  "V550"  "V860"  "V600"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "V")])
##  [1] "V270"  "V4500" "V430"  "V420"  "V700"  "V1000" "V15"   "V110" 
##  [9] "V140"  "V580"  "V170"  "V1200" "V440"  "V25"   "V6400" "V090" 
## [17] "V180"  "V660"  "V650"  "V570"  "V230"  "V160"  "V460"  "V08"  
## [25] "V53"   "V54"   "V490"  "V550"  "V220"  "V020"  "V630"  "V620" 
## [33] "V720"  "V600"  "V61"   "V850"  "V1300" "V010"  "V860"  "V030" 
## [41] "V070"  "V060"

Get all codes that start with E

diagE_unique<-UniqueDiags[startsWith(UniqueDiags, "E")]
diagE_unique
##  [1] "E909" "E878" "E812" "E932" "E888" "E939" "E937" "E944" "E870" "E849"
## [11] "E950" "E934" "E935" "E915" "E885" "E880" "E879" "E890" "E817" "E931"
## [21] "E924" "E942" "E947" "E930" "E858" "E929" "E933" "E900" "E936" "E941"
## [31] "E884" "E928" "E965" "E813" "E814" "E927" "E905" "E917" "E868" "E854"
## [41] "E918" "E850" "E887" "E881" "E829" "E919" "E916" "E819" "E826" "E938"
## [51] "E816" "E906" "E818" "E980" "E853" "E968" "E882" "E821" "E945" "E883"
## [61] "E949" "E920" "E956" "E904" "E943" "E861" "E852" "E876" "E855" "E815"
## [71] "E822" "E894" "E828" "E865" "E946" "E966" "E922" "E901" "E892" "E886"
## [81] "E987" "E912" "E955" "E864" "E825"
cat("\n")
print(paste("The number of unique values: ",length(diagE_unique)))
## [1] "The number of unique values:  85"

Matched: E956,E915,E918,E887,E916,E882,E894,E966,E892,E912

Add 0 at the end for the rest of the codes

Modify the E-codes to match the Find-A-Code format

# E-codes that already match the Find-A-Code format and must stay unchanged.
diagNochange <- c("E956", "E915", "E918", "E887", "E916", "E882", "E894", "E966", "E892", "E912")

# Every other E-code gets a single "0" appended.
diag0 <- diagE_unique[!(diagE_unique %in% diagNochange)]

# Rewrite the diagnosis columns only, one vectorised pass per column, instead
# of comparing the whole data frame (patient_nbr included) against each code
# in turn.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  add0 <- MyData_str[[diagCol]] %in% diag0
  MyData_str[[diagCol]][add0] <- paste0(MyData_str[[diagCol]][add0], "0")
}

# test the codes

unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "E")])
## [1] "E9090"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "E")])
##  [1] "E8780" "E8120" "E9320" "E8880" "E9390" "E9370" "E9440" "E8700"
##  [9] "E8490" "E9500" "E9340" "E9350" "E915"  "E8850" "E8800" "E8790"
## [17] "E8900" "E8170" "E9310" "E9240" "E9420" "E9470" "E9300" "E8580"
## [25] "E9290" "E9330" "E9000" "E9360" "E9410" "E8840" "E9280" "E9650"
## [33] "E8130" "E8140" "E9270" "E9050" "E9170" "E8680" "E8540" "E918" 
## [41] "E8500" "E887"  "E8810" "E8290" "E9190" "E916"  "E8190" "E8260"
## [49] "E9380" "E8160" "E9060" "E8180" "E9800" "E8530" "E9680" "E882" 
## [57] "E8210" "E9450" "E8830"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "E")])
##  [1] "E8880" "E9320" "E8780" "E8850" "E9340" "E8840" "E9330" "E8790"
##  [9] "E9350" "E8490" "E9420" "E9470" "E9500" "E9490" "E9290" "E9380"
## [17] "E8160" "E9200" "E9280" "E9390" "E9310" "E9300" "E956"  "E9450"
## [25] "E8170" "E9040" "E8700" "E9800" "E9430" "E8610" "E8500" "E9440"
## [33] "E8800" "E8830" "E9360" "E9050" "E9410" "E8520" "E887"  "E8580"
## [41] "E9270" "E8760" "E9370" "E9170" "E8120" "E9240" "E916"  "E8190"
## [49] "E9650" "E8550" "E9060" "E9190" "E8150" "E8810" "E882"  "E8220"
## [57] "E8260" "E8130" "E8180" "E894"  "E915"  "E8530" "E8280" "E8650"
## [65] "E9460" "E966"  "E9220" "E9010" "E892"  "E8860" "E9870" "E912" 
## [73] "E9550" "E8640" "E8250" "E9000" "E8540"

Get the single digit number

lessThan2<-diags[nchar(diags)==1]
unique(lessThan2)
## [1] "8" "?" "3" "7" "5" "9"
cat("\n")
# lessThan2 holds every single-character entry, duplicates included, so this
# is a count of occurrences rather than of distinct values; the label now says
# so.
print(paste("The number of single-character values: ",length(lessThan2)))
## [1] "The number of single-character values:  2797"
na<-lessThan2[startsWith(lessThan2, "?")]
cat("\n")
print(paste("The number of elements contain '?' : ",length(na)))
## [1] "The number of elements contain '?' :  1802"

For 8, add double zeros before and after: 8 = 00800

For all the other single-digit codes, add 00 before and 0 after

Replace ‘?’ with “”

# Blank out the "?" placeholder wherever it appears, column by column, then
# preview the result.
MyData_str[] <- lapply(
  MyData_str,
  function(column) replace(column, column == "?", "")
)
head(MyData_str)
##   patient_nbr diag_1 diag_2 diag_3
## 1     8222157 250.83              
## 2    55629189    276 250.01    255
## 3    86047875    648    250   V270
## 4    82442376      8 250.43    403
## 5    42519267    197    157    250
## 6    82637451    414    411    250

Transform the single digit numbers to 3 or more digits codes to match the Find-A-Code format

# Single-digit codes that become "00<d>0"; "8" is the exception and becomes
# "00800".
diag0 <- c("3", "7", "5", "9")

# Rewrite the diagnosis columns only. Comparing the whole data frame against
# each code would also rewrite any patient_nbr whose text equals a code, and
# rescans the full frame once per code.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]

  pad0 <- codes %in% diag0
  MyData_str[[diagCol]][pad0] <- paste0("00", codes[pad0], "0")

  MyData_str[[diagCol]][codes %in% "8"] <- paste0("00", "8", "00")
}

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "00")])
## [1] "00800" "0030"  "0070"  "0050"  "0090"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "00")])
## [1] "00800" "0090"  "0050"  "0070"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "00")])
## [1] "00800" "0090"  "0050"  "0070"  "0030"

Get the double digit numbers

lessThan3<-diags[nchar(diags)==2]
ulessThan3<-unique(lessThan3)
ulessThan3
##  [1] "38" "70" "82" "54" "78" "79" "41" "53" "49" "11" "47" "42" "39" "48"
## [15] "94" "35" "34" "88" "31" "75" "27" "61" "36" "57" "97" "66" "98" "52"
## [29] "84" "23" "58" "10" "40" "46" "96" "99" "17" "14"
cat("\n")
print(paste("The number of unique values: ",length(ulessThan3)))
## [1] "The number of unique values:  38"

No Match: 58

Add 0 before and 00 after: 41,11,10,17,14

Add 0 before: 42,48,35,75,61,96

Add 0 before and 0 after for the rest of the code

Transform the double digit numbers to 3 or more digit codes to match the Find-A-Code format

# Two-digit code groups from the manual Find-A-Code comparison:
#   diagNot       - no match; left unchanged and recorded as non-matching
#   diag0Before00 - become "0<code>00"
#   diag0Before   - become "0<code>"
diagNot <- c("58")
diag0Before00 <- as.character(c(41, 11, 10, 17, 14))
diag0Before <- as.character(c(42, 48, 35, 75, 61, 96))

diagNot0 <- c(diagNot, diag0Before00, diag0Before)

# Every other two-digit code becomes "0<code>0".
diag0 <- ulessThan3[!(ulessThan3 %in% diagNot0)]

# Rewrite the diagnosis columns only. Comparing the whole data frame against
# each code would also rewrite any patient_nbr whose text equals a code, and
# rescans the full frame once per code.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  # Masks are taken from the untouched column: the three groups are disjoint
  # and no rewritten value is two characters long, so order does not matter.
  codes <- MyData_str[[diagCol]]

  wrap0 <- codes %in% diag0
  MyData_str[[diagCol]][wrap0] <- paste0("0", codes[wrap0], "0")

  wrap00 <- codes %in% diag0Before00
  MyData_str[[diagCol]][wrap00] <- paste0("0", codes[wrap00], "00")

  lead0 <- codes %in% diag0Before
  MyData_str[[diagCol]][lead0] <- paste0("0", codes[lead0])
}

nonMatchingCodes <- c(nonMatchingCodes, diagNot)
# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "01")])
## [1] "01100" "01000"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "03")])
## [1] "0380" "0340" "0310" "035"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "04")])
## [1] "04100" "042"   "0470"  "0490"

Create a vector of 3 digit numbers codes excluding V-codes

# Keep the three-character codes that do not begin with "V", then list the
# distinct ones.
isThreeChars <- nchar(diags) == 3
lessThan4 <- diags[isThreeChars & !startsWith(diags, "V")]
unique(lessThan4)
##   [1] "276" "648" "197" "414" "428" "398" "434" "157" "518" "999" "410"
##  [12] "682" "402" "737" "572" "189" "786" "427" "996" "277" "584" "462"
##  [23] "473" "411" "174" "486" "998" "511" "432" "626" "295" "196" "618"
##  [34] "182" "845" "423" "808" "722" "403" "784" "707" "440" "151" "715"
##  [45] "997" "198" "564" "812" "590" "556" "578" "433" "569" "185" "536"
##  [56] "255" "599" "558" "574" "491" "560" "244" "577" "730" "188" "824"
##  [67] "332" "562" "291" "296" "510" "401" "263" "438" "493" "642" "625"
##  [78] "571" "738" "593" "807" "456" "446" "575" "820" "515" "780" "995"
##  [89] "235" "721" "787" "162" "724" "282" "514" "281" "530" "466" "435"
## [100] "789" "566" "822" "191" "557" "733" "455" "711" "482" "202" "280"
## [111] "553" "225" "154" "441" "349" "962" "592" "507" "386" "156" "200"
## [122] "728" "348" "459" "426" "388" "607" "337" "531" "596" "288" "656"
## [133] "573" "492" "220" "516" "210" "922" "286" "885" "958" "661" "969"
## [144] "227" "112" "404" "823" "532" "416" "346" "535" "453" "250" "595"
## [155] "211" "303" "852" "218" "782" "540" "457" "285" "431" "340" "550"
## [166] "351" "601" "723" "555" "153" "443" "380" "204" "424" "241" "358"
## [177] "694" "331" "345" "681" "447" "290" "158" "579" "436" "335" "309"
## [188] "654" "805" "799" "292" "183" "851" "458" "586" "311" "892" "305"
## [199] "293" "415" "591" "794" "803" "655" "429" "278" "658" "598" "729"
## [210] "585" "444" "604" "727" "214" "552" "284" "680" "708" "644" "481"
## [221] "821" "413" "437" "968" "756" "632" "359" "275" "512" "781" "420"
## [232] "368" "522" "294" "825" "135" "304" "320" "669" "868" "496" "826"
## [243] "567" "203" "251" "565" "161" "495" "297" "663" "576" "355" "850"
## [254] "287" "611" "840" "350" "726" "537" "620" "180" "366" "783" "751"
## [265] "716" "199" "464" "580" "836" "664" "283" "813" "966" "289" "965"
## [276] "184" "480" "608" "333" "972" "212" "117" "788" "924" "959" "621"
## [287] "238" "785" "714" "942" "710" "933" "508" "478" "844" "736" "233"
## [298] "397" "395" "201" "421" "253" "600" "494" "977" "659" "312" "614"
## [309] "647" "652" "646" "274" "861" "425" "527" "451" "485" "217" "442"
## [320] "970" "193" "160" "322" "581" "475" "623" "374" "582" "568" "465"
## [331] "801" "237" "376" "150" "461" "913" "226" "617" "987" "641" "298"
## [342] "790" "336" "362" "228" "513" "383" "746" "353" "911" "506" "873"
## [353] "155" "860" "534" "802" "141" "396" "310" "341" "242" "719" "239"
## [364] "533" "616" "519" "301" "989" "230" "385" "300" "853" "871" "570"
## [375] "848" "463" "934" "236" "361" "594" "501" "810" "643" "430" "528"
## [386] "205" "791" "983" "992" "490" "172" "171" "622" "306" "863" "864"
## [397] "474" "660" "759" "356" "634" "967" "551" "695" "187" "732" "747"
## [408] "323" "308" "370" "252" "152" "846" "164" "365" "718" "266" "720"
## [419] "344" "797" "170" "878" "904" "882" "843" "709" "973" "454" "686"
## [430] "939" "487" "229" "991" "483" "357" "692" "796" "693" "935" "936"
## [441] "800" "920" "261" "307" "262" "831" "145" "223" "839" "685" "179"
## [452] "964" "136" "324" "389" "815" "334" "143" "526" "588" "192" "394"
## [463] "917" "219" "325" "792" "717" "994" "990" "793" "207" "637" "195"
## [474] "373" "847" "827" "891" "814" "703" "865" "352" "627" "378" "342"
## [485] "886" "369" "745" "705" "816" "541" "986" "610" "633" "640" "753"
## [496] "173" "835" "379" "445" "272" "382" "945" "619" "881" "866" "405"
## [507] "916" "215" "893" "671" "928" "906" "897" "725" "867" "115" "890"
## [518] "734" "521" "674" "470" "834" "146" "696" "524" "980" "691" "384"
## [529] "142" "879" "246" "208" "448" "955" "653" "149" "245" "735" "883"
## [540] "854" "952" "838" "194" "163" "216" "147" "354" "477" "318" "880"
## [551] "921" "377" "471" "683" "175" "602" "982" "706" "375" "417" "131"
## [562] "347" "870" "148" "862" "817" "914" "360" "684" "314" "240" "915"
## [573] "971" "795" "988" "452" "963" "327" "731" "842" "645" "665" "110"
## [584] "944" "603" "923" "412" "363" "957" "976" "698" "299" "700" "273"
## [595] "974" "529" "605" "941" "806" "271" "837" "657" "895" "338" "523"
## [606] "542" "114" "543" "372" "583" "422" "615" "279" "500" "903" "919"
## [617] "875" "381" "804" "704" "649" "832" "133" "975" "833" "391" "690"
## [628] "319" "258" "910" "317" "484" "138" "343" "758" "701" "872" "905"
## [639] "752" "909" "918" "947" "520" "517" "912" "702" "111" "259" "953"
## [650] "712" "741" "713" "755" "742" "869" "907" "908" "472" "811" "137"
## [661] "754" "130" "269" "232" "316" "748" "256" "186" "948" "750" "302"
## [672] "140" "670" "268" "894" "260" "270" "460" "364" "123" "884" "927"
## [683] "525" "315" "139" "313" "122" "387" "951" "697" "943" "744" "243"
## [694] "956" "265" "597" "930" "132" "757" "624" "841" "877" "538" "876"
cat("\n")
print(paste("The number of unique values: ",length(unique(lessThan4))))
## [1] "The number of unique values:  704"

Get all 3 digit numbers that start with 1 from the new vector

lessThan41<-lessThan4[startsWith(lessThan4, "1")]
ulessThan41<-unique(lessThan41)
ulessThan41
##  [1] "197" "157" "189" "174" "196" "182" "151" "198" "185" "188" "162"
## [12] "191" "154" "156" "112" "153" "158" "183" "135" "161" "180" "199"
## [23] "184" "117" "193" "160" "150" "155" "141" "172" "171" "187" "152"
## [34] "164" "170" "145" "179" "136" "143" "192" "195" "173" "115" "146"
## [45] "142" "149" "194" "163" "147" "175" "131" "148" "110" "114" "133"
## [56] "138" "111" "137" "130" "186" "140" "123" "139" "122" "132"
cat("\n")
print(paste("The number of unique values: ",length(ulessThan41)))
## [1] "The number of unique values:  65"

No Match: 187

Matched: 185,135,193,179,138

Add 00 at the end of the code: 173,115,131

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 1 to match the Find-A-Code format

# 1xx code groups from the manual Find-A-Code comparison:
#   diagNot      - no match; left unchanged and recorded as non-matching
#   diag00       - need "00" appended
#   diagNochange - already match
diagNot <- c("187")
diag00 <- c("173", "115", "131")
diagNochange <- c("185", "135", "193", "179", "138")

diagNot0 <- c(diagNot, diag00, diagNochange)

# Every other 1xx code gets a single "0" appended.
diag0 <- ulessThan41[!(ulessThan41 %in% diagNot0)]

# Rewrite the diagnosis columns only. Comparing the whole data frame against
# each code would also rewrite any patient_nbr whose text equals a code, and
# rescans the full frame once per code.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  add0 <- MyData_str[[diagCol]] %in% diag0
  MyData_str[[diagCol]][add0] <- paste0(MyData_str[[diagCol]][add0], "0")

  add00 <- MyData_str[[diagCol]] %in% diag00
  MyData_str[[diagCol]][add00] <- paste0(MyData_str[[diagCol]][add00], "00")
}

nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "1")])
##  [1] "1970"  "1570"  "1890"  "1740"  "1960"  "1820"  "1510"  "1980" 
##  [9] "185"   "1880"  "1620"  "1910"  "1540"  "1560"  "1120"  "1530" 
## [17] "1580"  "1830"  "135"   "1610"  "1800"  "1990"  "1840"  "1170" 
## [25] "193"   "1600"  "1500"  "1550"  "1410"  "1720"  "1710"  "187"  
## [33] "1520"  "1640"  "1700"  "1450"  "179"   "1360"  "1430"  "1920" 
## [41] "1950"  "17300" "11500" "1460"  "1420"  "1490"  "1940"  "1630" 
## [49] "1470"  "1750"  "13100" "1480"  "1100"  "1140"  "1330"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "1")])
##  [1] "1570"  "1980"  "1740"  "135"   "1960"  "1970"  "1500"  "1530" 
##  [9] "1120"  "1620"  "1540"  "1170"  "179"   "1890"  "1550"  "1720" 
## [17] "1990"  "1910"  "1360"  "1510"  "185"   "138"   "13100" "1100" 
## [25] "1880"  "1820"  "1110"  "1560"  "193"   "1520"  "17300" "1370" 
## [33] "1300"  "1830"  "1630"  "1710"  "1640"  "1860"  "1450"  "1920" 
## [41] "11500" "1410"  "1400"  "1950"  "1140"  "1800"  "1230"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "1")])
##  [1] "1970"  "13100" "1960"  "1820"  "1620"  "1980"  "1530"  "1120" 
##  [9] "1550"  "138"   "135"   "1390"  "185"   "1170"  "1990"  "1220" 
## [17] "1360"  "1570"  "1740"  "1500"  "1100"  "1890"  "1510"  "1910" 
## [25] "1880"  "1700"  "1800"  "1580"  "1720"  "17300" "193"   "1540" 
## [33] "1610"  "11500" "179"   "1630"  "1520"  "1830"  "1640"  "1950" 
## [41] "1710"  "1560"  "1920"  "1860"  "1460"  "1320"  "1480"  "1110" 
## [49] "1230"  "1410"  "1750"

Get the 3 digit numbers that start with 2 from the vector

lessThan42<-lessThan4[startsWith(lessThan4, "2")]
ulessThan42<-unique(lessThan42)
ulessThan42
##  [1] "276" "277" "295" "255" "244" "291" "296" "263" "235" "282" "281"
## [12] "202" "280" "225" "200" "288" "220" "210" "286" "227" "250" "211"
## [23] "218" "285" "204" "241" "290" "292" "293" "278" "214" "284" "275"
## [34] "294" "203" "251" "297" "287" "283" "289" "212" "238" "233" "201"
## [45] "253" "274" "217" "237" "226" "298" "228" "242" "239" "230" "236"
## [56] "205" "252" "266" "229" "261" "262" "223" "219" "207" "272" "215"
## [67] "246" "208" "245" "216" "240" "299" "273" "271" "279" "258" "259"
## [78] "269" "232" "256" "268" "260" "270" "243" "265"
cat("\n")
print(paste("The number of unique values: ",length(ulessThan42)))
## [1] "The number of unique values:  85"

No Match: 284,275,258,260,243

Matched: 220,217,226,261,262

Add 00 at the end of the code: 277,295,296,202,200,288,250,204,278,203,201,274,228,242,205,250,207,208,299,279

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 2 to match the Find-A-Code format

# 2xx code groups from the manual Find-A-Code comparison:
#   diagNot      - no match; left unchanged and recorded as non-matching
#   diag00       - need "00" appended
#   diagNochange - already match
# The duplicated "250" entry has been dropped from diag00; the second
# occurrence could never match once the first had rewritten the value.
# NOTE(review): the written rule for this section also lists 279 under
# "add 00", but 279 is not in diag00, so it falls into diag0 and becomes
# "2790". Behaviour is left as it was - confirm which one is intended.
diagNot <- c("284", "275", "258", "260", "243")
diag00 <- c("277", "295", "296", "202", "200", "288", "250", "204", "278", "203", "201", "274", "228", "242", "205", "207", "208", "299")
diagNochange <- c("220", "217", "226", "261", "262")

diagNot0 <- c(diagNot, diag00, diagNochange)

# Every other 2xx code gets a single "0" appended.
diag0 <- ulessThan42[!(ulessThan42 %in% diagNot0)]

# Rewrite the diagnosis columns only. Comparing the whole data frame against
# each code would also rewrite any patient_nbr whose text equals a code, and
# rescans the full frame once per code.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  add0 <- MyData_str[[diagCol]] %in% diag0
  MyData_str[[diagCol]][add0] <- paste0(MyData_str[[diagCol]][add0], "0")

  add00 <- MyData_str[[diagCol]] %in% diag00
  MyData_str[[diagCol]][add00] <- paste0(MyData_str[[diagCol]][add00], "00")
}

nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "2")])
##   [1] "250.83" "2760"   "250.7"  "27700"  "29500"  "250.6"  "250.4" 
##   [8] "250.11" "250.32" "2550"   "250.13" "2440"   "250.03" "250.8" 
##  [15] "2910"   "29600"  "2630"   "250.02" "250.42" "250.41" "250.22"
##  [22] "2350"   "250.82" "2820"   "2810"   "250.33" "250.12" "20200" 
##  [29] "2800"   "2250"   "250.81" "20000"  "28800"  "220"    "2100"  
##  [36] "2860"   "250.93" "2270"   "25000"  "2110"   "250.01" "2180"  
##  [43] "2850"   "20400"  "2410"   "2900"   "2920"   "2930"   "27800" 
##  [50] "2140"   "284"    "275"    "2940"   "250.31" "250.43" "20300" 
##  [57] "2510"   "250.1"  "2970"   "2870"   "250.2"  "250.3"  "2830"  
##  [64] "2890"   "2120"   "2380"   "250.23" "2330"   "250.5"  "20100" 
##  [71] "2530"   "250.92" "27400"  "217"    "250.53" "2370"   "226"   
##  [78] "2980"   "22800"  "24200"  "2390"   "2300"   "250.21" "2360"  
##  [85] "20500"  "2520"   "2660"   "2290"   "261"    "262"    "250.9" 
##  [92] "2230"   "2190"   "20700"  "2720"   "250.52" "2150"   "250.51"
##  [99] "2460"   "20800"  "2450"   "2160"   "250.91" "2400"   "29900" 
## [106] "2730"   "2710"   "2790"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "2")])
##   [1] "250.01" "25000"  "250.43" "28800"  "250.02" "2440"   "2760"  
##   [8] "2860"   "24200"  "250.03" "250.52" "2850"   "250.6"  "22800" 
##  [15] "250.82" "2940"   "250.51" "2800"   "27700"  "250.42" "27800" 
##  [22] "2720"   "20300"  "250.41" "250.13" "2930"   "2450"   "250.12"
##  [29] "250.53" "284"    "2920"   "2900"   "250.93" "2550"   "250.7" 
##  [36] "2870"   "20000"  "250.83" "250.11" "2830"   "250.81" "29500" 
##  [43] "20400"  "2710"   "2270"   "250.5"  "258"    "2530"   "250.91"
##  [50] "250.92" "250.4"  "20500"  "2110"   "2630"   "20200"  "250.23"
##  [57] "20100"  "2970"   "2180"   "220"    "250.22" "250.8"  "2790"  
##  [64] "2810"   "20800"  "29600"  "2890"   "2230"   "2910"   "217"   
##  [71] "2980"   "2250"   "250.9"  "2330"   "275"    "250.1"  "27400" 
##  [78] "2730"   "2410"   "2820"   "250.2"  "2140"   "29900"  "250.31"
##  [85] "2520"   "2590"   "2390"   "226"    "250.33" "250.32" "2510"  
##  [92] "2380"   "261"    "250.21" "2460"   "2400"   "2660"   "2690"  
##  [99] "2320"   "2560"   "262"    "250.3"  "2120"   "2150"   "2350"  
## [106] "2680"   "260"    "2700"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "2")])
##   [1] "2550"   "25000"  "250.6"  "250.01" "2630"   "250.42" "2760"  
##   [8] "250.41" "27800"  "250.02" "250.8"  "250.7"  "250.52" "250.82"
##  [15] "250.03" "29600"  "250.4"  "2440"   "250.51" "2800"   "2720"  
##  [22] "250.43" "250.92" "284"    "2850"   "20100"  "2920"   "2940"  
##  [29] "250.23" "250.53" "2820"   "2870"   "2180"   "28800"  "29500" 
##  [36] "2810"   "250.83" "2910"   "250.5"  "250.91" "2380"   "2930"  
##  [43] "2900"   "250.13" "250.12" "2860"   "2520"   "258"    "250.81"
##  [50] "20200"  "250.9"  "2890"   "24200"  "2530"   "250.22" "20300" 
##  [57] "250.93" "275"    "2110"   "2790"   "27400"  "2270"   "20800" 
##  [64] "22800"  "2400"   "2700"   "2660"   "2450"   "2830"   "2510"  
##  [71] "250.2"  "2730"   "27700"  "2460"   "2980"   "2590"   "20400" 
##  [78] "250.1"  "20500"  "20000"  "2350"   "2410"   "260"    "2390"  
##  [85] "261"    "250.11" "2710"   "2140"   "220"    "2560"   "243"   
##  [92] "2970"   "29900"  "2250"   "2330"   "250.3"  "262"    "2650"  
##  [99] "2160"   "250.21" "250.31" "226"    "2680"   "2230"   "2360"  
## [106] "217"    "2150"   "2300"

Get the 3 digit numbers that Start with 3

lessThan43<-lessThan4[startsWith(lessThan4, "3")]
ulessThan43<-unique(lessThan43)
ulessThan43
##  [1] "398" "332" "349" "386" "348" "388" "337" "346" "303" "340" "351"
## [12] "380" "358" "331" "345" "335" "309" "311" "305" "359" "368" "304"
## [23] "320" "355" "350" "366" "333" "397" "395" "312" "322" "374" "376"
## [34] "336" "362" "383" "353" "396" "310" "341" "301" "385" "300" "361"
## [45] "306" "356" "323" "308" "370" "365" "344" "357" "307" "324" "389"
## [56] "334" "394" "325" "373" "352" "378" "342" "369" "379" "382" "384"
## [67] "354" "318" "377" "375" "347" "360" "314" "327" "363" "338" "372"
## [78] "381" "391" "319" "317" "343" "316" "302" "364" "315" "313" "387"
cat("\n")
print(paste("The number of unique values: ",length(ulessThan43)))
## [1] "The number of unique values:  88"

No Match: 350,312,362,323

Matched: 340,311,395,319,317,316

Add 00 at end of the code: 386,388,337,346,303,380,358,345,305,368,304,366,374,376,383,385,300,361,370,365,344,389,373,378,342,369,379,382,384,377,375,347,360,314,327,363,372,381,364,315

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 3 to match the Find-A-Code format

# 3xx code groups from the manual Find-A-Code comparison:
#   diagNot      - no match; left unchanged and recorded as non-matching
#   diag00       - need "00" appended
#   diagNochange - already match
diagNot <- as.character(c(350, 312, 362, 323))
diag00 <- as.character(c(386, 388, 337, 346, 303, 380, 358, 345, 305, 368, 304, 366, 374, 376, 383, 385, 300, 361, 370, 365, 344, 389, 373, 378, 342, 369, 379, 382, 384, 377, 375, 347, 360, 314, 327, 363, 372, 381, 364, 315))
diagNochange <- as.character(c(340, 311, 395, 319, 317, 316))

diagNot0 <- c(diagNot, diag00, diagNochange)

# Every other 3xx code gets a single "0" appended.
diag0 <- ulessThan43[!(ulessThan43 %in% diagNot0)]

# Rewrite the diagnosis columns only. Comparing the whole data frame against
# each code would also rewrite any patient_nbr whose text equals a code, and
# rescans the full frame once per code.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  add0 <- MyData_str[[diagCol]] %in% diag0
  MyData_str[[diagCol]][add0] <- paste0(MyData_str[[diagCol]][add0], "0")

  add00 <- MyData_str[[diagCol]] %in% diag00
  MyData_str[[diagCol]][add00] <- paste0(MyData_str[[diagCol]][add00], "00")
}

nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "3")])
##  [1] "3980"  "3320"  "3490"  "38600" "3480"  "38800" "33700" "34600"
##  [9] "30300" "340"   "3510"  "38000" "35800" "3310"  "34500" "3350" 
## [17] "3090"  "311"   "30500" "3590"  "36800" "30400" "3200"  "3550" 
## [25] "350"   "36600" "3330"  "3970"  "395"   "312"   "3220"  "37400"
## [33] "37600" "3360"  "362"   "38300" "3530"  "3960"  "3100"  "3410" 
## [41] "3010"  "38500" "30000" "36100" "3060"  "3560"  "323"   "3080" 
## [49] "37000" "36500" "34400" "3570"  "3070"  "3240"  "38900" "3340" 
## [57] "3940"  "3250"  "37300" "3520"  "37800" "34200" "36900" "37900"
## [65] "38200" "38400" "3540"  "3180"  "37700" "37500" "34700" "36000"
## [73] "31400" "32700" "36300" "3380"  "37200" "38100" "3910"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "3")])
##  [1] "3570"  "3960"  "340"   "30500" "34200" "30300" "3980"  "319"  
##  [9] "30000" "30400" "37200" "38200" "36800" "3090"  "3330"  "34500"
## [17] "3970"  "3010"  "3540"  "35800" "3310"  "31400" "37800" "33700"
## [25] "311"   "3480"  "317"   "3320"  "36900" "3490"  "38600" "34400"
## [33] "37700" "3520"  "3360"  "362"   "3430"  "38000" "3590"  "3240" 
## [41] "3940"  "38300" "38900" "36500" "3510"  "3060"  "34600" "3560" 
## [49] "3220"  "3410"  "3350"  "3200"  "3080"  "37300" "323"   "3070" 
## [57] "3250"  "312"   "3550"  "38100" "3100"  "37900" "37600" "36600"
## [65] "32700" "316"   "34700" "37400" "395"   "3180"  "3530"  "36000"
## [73] "3380"  "3020"  "350"   "38800" "36400"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "3")])
##  [1] "30500"  "30300"  "3320"   "362"    "3560"   "3570"   "3480"  
##  [8] "3970"   "3310"   "3490"   "31500"  "38200"  "30000"  "3960"  
## [15] "30400"  "33700"  "311"    "38600"  "34200"  "3070"   "319"   
## [22] "34600"  "38000"  "3090"   "34400"  "3940"   "3010"   "36500" 
## [29] "3340"   "3980"   "36800"  "34500"  "3130"   "37900"  "37300" 
## [36] "340"    "3510"   "3360"   "38900"  "36600"  "3100"   "3870"  
## [43] "3550"   "37200"  "36900"  "3430"   "31400"  "3540"   "3330"  
## [50] "37800"  "38300"  "35800"  "3410"   "323"    "3080"   "3180"  
## [57] "350"    "317"    "36000"  "3590"   "3350"   "3060"   "395"   
## [64] "38400"  "37700"  "34700"  "312"    "38500"  "37400"  "32700" 
## [71] "37600"  "36100"  "365.44" "38100"  "3530"   "3380"   "38800" 
## [78] "3910"   "37000"

Get the 3 digit numbers that Start with 4

lessThan44<-lessThan4[startsWith(lessThan4, "4")]
ulessThan44<-unique(lessThan44)
ulessThan44
##  [1] "414" "428" "434" "410" "402" "427" "462" "473" "411" "486" "432"
## [12] "423" "403" "440" "433" "491" "401" "438" "493" "456" "446" "466"
## [23] "435" "455" "482" "441" "459" "426" "492" "404" "416" "453" "457"
## [34] "431" "443" "424" "447" "436" "458" "415" "429" "444" "481" "413"
## [45] "437" "420" "496" "495" "464" "480" "478" "421" "494" "425" "451"
## [56] "485" "442" "475" "465" "461" "463" "430" "490" "474" "454" "487"
## [67] "483" "445" "405" "470" "448" "477" "471" "417" "452" "412" "422"
## [78] "484" "472" "460"
cat("\n")
print(paste("The number of unique values: ",length(ulessThan44)))
## [1] "The number of unique values:  80"

No Match: 444,445,405,484

Matched: 462,431,481,496,485,475,463,430,490,470,452,412,460

Add 00 at the end of the code: 414,434,410,402,486,433,493,441,404,436,464,474

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 4 to match the Find-A-Code format

# 4xx code groups from the manual Find-A-Code comparison:
#   diagNot      - no match; left unchanged and recorded as non-matching
#   diagNochange - already match
#   diag00       - need "00" appended
diagNot <- as.character(c(444, 445, 405, 484))
diagNochange <- as.character(c(462, 431, 481, 496, 485, 475, 463, 430, 490, 470, 452, 412, 460))
diag00 <- as.character(c(414, 434, 410, 402, 486, 433, 493, 441, 404, 436, 464, 474))

diagNot0 <- c(diagNot, diag00, diagNochange)

# Every other 4xx code gets a single "0" appended.
diag0 <- ulessThan44[!(ulessThan44 %in% diagNot0)]

# Rewrite the diagnosis columns only. Comparing the whole data frame against
# each code would also rewrite any patient_nbr whose text equals a code, and
# rescans the full frame once per code.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  add0 <- MyData_str[[diagCol]] %in% diag0
  MyData_str[[diagCol]][add0] <- paste0(MyData_str[[diagCol]][add0], "0")

  add00 <- MyData_str[[diagCol]] %in% diag00
  MyData_str[[diagCol]][add00] <- paste0(MyData_str[[diagCol]][add00], "00")
}

nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "4")])
##  [1] "41400" "4280"  "43400" "41000" "40200" "4270"  "462"   "4730" 
##  [9] "4110"  "48600" "4320"  "4230"  "4030"  "4400"  "43300" "4910" 
## [17] "4010"  "4380"  "49300" "4560"  "4460"  "4660"  "4350"  "4550" 
## [25] "4820"  "44100" "4590"  "4260"  "4920"  "40400" "4160"  "4530" 
## [33] "4570"  "431"   "4430"  "4240"  "4470"  "43600" "4580"  "4150" 
## [41] "4290"  "444"   "481"   "4130"  "4370"  "4200"  "496"   "4950" 
## [49] "46400" "4800"  "4780"  "4210"  "4940"  "4250"  "4510"  "485"  
## [57] "4420"  "475"   "4650"  "4610"  "463"   "430"   "490"   "47400"
## [65] "4540"  "4870"  "4830"  "445"   "405"   "470"   "4480"  "4770" 
## [73] "4710"  "4170"  "452"   "412"   "4220"
unique(MyData_str$diag_2[startsWith(MyData_str$diag_2, "4")])
##  [1] "4110"  "4920"  "4270"  "4030"  "4250"  "4560"  "4010"  "496"  
##  [9] "4280"  "41000" "4240"  "4910"  "4400"  "49300" "41400" "4130" 
## [17] "48600" "444"   "4260"  "462"   "40200" "4580"  "4150"  "4350" 
## [25] "4460"  "4470"  "4160"  "412"   "44100" "4820"  "4320"  "4660" 
## [33] "43400" "4590"  "4200"  "43300" "405"   "4370"  "4530"  "4510" 
## [41] "43600" "4780"  "40400" "470"   "4430"  "4230"  "4650"  "481"  
## [49] "4800"  "4380"  "4730"  "490"   "485"   "431"   "4550"  "484"  
## [57] "4290"  "452"   "4480"  "4210"  "4940"  "430"   "4870"  "4570" 
## [65] "4610"  "4830"  "4540"  "47400" "4420"  "4720"  "46400" "4220" 
## [73] "463"   "475"   "4950"  "4770"  "460"
unique(MyData_str$diag_3[startsWith(MyData_str$diag_3, "4")])
##  [1] "4030"  "48600" "4270"  "41400" "4160"  "4280"  "4820"  "4010" 
##  [9] "496"   "4240"  "4110"  "490"   "4910"  "4200"  "49300" "4250" 
## [17] "4920"  "4380"  "40400" "4400"  "4260"  "4610"  "4580"  "4130" 
## [25] "4730"  "4550"  "4590"  "4370"  "4530"  "40200" "4650"  "412"  
## [33] "43300" "4430"  "41000" "4150"  "43600" "44100" "4460"  "444"  
## [41] "4350"  "4660"  "4780"  "4290"  "4420"  "4870"  "4560"  "4800" 
## [49] "4230"  "43400" "4940"  "4470"  "405"   "4540"  "4510"  "4950" 
## [57] "4720"  "452"   "4320"  "4770"  "462"   "4570"  "46400" "445"  
## [65] "481"   "4210"  "470"   "460"   "4170"  "485"   "431"   "4830" 
## [73] "475"   "430"   "484"   "463"   "4480"

Get the 3 digit numbers that Start with 5

lessThan45<-lessThan4[startsWith(lessThan4, "5")]
ulessThan45<-unique(lessThan45)
ulessThan45
##  [1] "518" "572" "584" "511" "564" "590" "556" "578" "569" "536" "599"
## [12] "558" "574" "560" "577" "562" "510" "571" "593" "575" "515" "514"
## [23] "530" "566" "557" "553" "592" "507" "531" "596" "573" "516" "532"
## [34] "535" "595" "540" "550" "555" "579" "586" "591" "598" "585" "552"
## [45] "512" "522" "567" "565" "576" "537" "580" "508" "527" "581" "582"
## [56] "568" "513" "506" "534" "533" "519" "570" "594" "501" "528" "551"
## [67] "526" "588" "541" "521" "524" "529" "523" "542" "543" "583" "500"
## [78] "520" "517" "525" "597" "538"
cat("\n")
print(paste("The number of unique values: ",length(ulessThan45)))
## [1] "The number of unique values:  82"

No match: 584,558,585,517

Matched: 515,514,566,586,591,570,501,541,542,500,538

Add 00 at the end of the code: 564,590,574,562,553,531,532,535,550,598,552,534,533,519,528,551,521,524,523

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 5 to match the Find-A-Code format

# Re-pad the 3-character codes that start with "5". The codes are split
# into four groups:
#   diagNot      - no match found; left unchanged and recorded as non-matching
#   diagNochange - already match as they are; left unchanged
#   diag00       - receive a trailing "00"
#   diag0        - every other code in ulessThan45; receives a trailing "0"
diagNot <- as.character(c(584, 558, 585, 517))
diagNochange <- as.character(
  c(515, 514, 566, 586, 591, 570, 501, 541, 542, 500, 538)
)
diag00 <- as.character(c(
  564, 590, 574, 562, 553, 531, 532, 535, 550, 598,
  552, 534, 533, 519, 528, 551, 521, 524, 523
))

# Every code that must NOT receive a single trailing "0".
diagNot0 <- c(diagNot, diag00, diagNochange)

diag0 <- ulessThan45[!(ulessThan45 %in% diagNot0)]

# Rewrite only the diagnosis columns: comparing against the whole data frame
# would also rewrite any patient_nbr that happens to equal one of these codes.
# Both masks are computed before any value changes, so no code is padded twice.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]
  needs0 <- codes %in% diag0
  needs00 <- codes %in% diag00
  codes[needs0] <- paste0(codes[needs0], "0")
  codes[needs00] <- paste0(codes[needs00], "00")
  MyData_str[[diagCol]] <- codes
}

# Record the codes without a match, then test the codes
nonMatchingCodes <- c(nonMatchingCodes, diagNot)
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "5")])
##  [1] "5180"  "5720"  "584"   "5110"  "56400" "59000" "5560"  "5780" 
##  [9] "5690"  "5360"  "5990"  "558"   "57400" "5600"  "5770"  "56200"
## [17] "5100"  "5710"  "5930"  "5750"  "515"   "514"   "5300"  "566"  
## [25] "5570"  "55300" "5920"  "5070"  "53100" "5960"  "5730"  "5160" 
## [33] "53200" "53500" "5950"  "5400"  "55000" "5550"  "5790"  "586"  
## [41] "591"   "59800" "585"   "55200" "5120"  "5220"  "5670"  "5650" 
## [49] "5760"  "5370"  "5800"  "5080"  "5270"  "5810"  "5820"  "5680" 
## [57] "5130"  "5060"  "53400" "53300" "51900" "570"   "5940"  "501"  
## [65] "52800" "55100" "5260"  "5880"  "541"   "52100" "52400" "5290" 
## [73] "52300" "542"   "5430"  "5830"  "500"   "58"
# Distinct diag_2 values that begin with "5" after the transformation.
with(MyData_str, unique(diag_2[startsWith(diag_2, "5")]))
##  [1] "5070"  "585"   "5710"  "5990"  "55300" "5110"  "5770"  "53500"
##  [9] "5180"  "566"   "57400" "5810"  "5300"  "5670"  "5120"  "5600" 
## [17] "5780"  "584"   "558"   "5730"  "5690"  "5360"  "56200" "591"  
## [25] "5750"  "515"   "5720"  "5760"  "5950"  "59000" "51900" "5370" 
## [33] "5680"  "5830"  "56400" "5960"  "53100" "5080"  "5130"  "55000"
## [41] "5100"  "5920"  "59800" "5790"  "570"   "5930"  "53200" "5570" 
## [49] "5800"  "5650"  "5560"  "55200" "5200"  "517"   "5160"  "5940" 
## [57] "52800" "501"   "5220"  "5060"  "5880"  "53300" "586"   "5550" 
## [65] "5430"  "5400"  "52100" "52400" "514"   "500"   "5270"  "542"  
## [73] "53400" "5290"  "52300"
# Distinct diag_3 values that begin with "5" after the transformation.
with(MyData_str, unique(diag_3[startsWith(diag_3, "5")]))
##  [1] "5820"  "585"   "5680"  "5990"  "5180"  "55300" "53100" "5110" 
##  [9] "56200" "5810"  "5290"  "5950"  "5600"  "5880"  "5690"  "584"  
## [17] "53500" "5930"  "56400" "5070"  "5250"  "5720"  "591"   "5760" 
## [25] "53300" "5780"  "5710"  "5360"  "5960"  "558"   "570"   "5770" 
## [33] "5120"  "59800" "57400" "5300"  "515"   "5830"  "5730"  "53200"
## [41] "5940"  "51900" "5550"  "5650"  "5800"  "5750"  "517"   "5570" 
## [49] "5160"  "586"   "5920"  "59000" "5220"  "5370"  "55200" "52800"
## [57] "5670"  "53400" "52100" "5060"  "566"   "52400" "5560"  "5270" 
## [65] "5790"  "5080"  "501"   "5430"  "514"   "5100"  "5970"  "542"  
## [73] "52300" "500"   "55000" "5400"  "538"

Get the 3 digit numbers that Start with 6

# Keep the 3-character codes whose first character is "6", then list the
# distinct values.
lessThan46 <- lessThan4[substr(lessThan4, 1L, 1L) == "6"]
ulessThan46 <- unique(lessThan46)
print(ulessThan46)
##  [1] "648" "682" "626" "618" "642" "625" "607" "656" "661" "601" "694"
## [12] "681" "654" "655" "658" "604" "680" "644" "632" "669" "663" "611"
## [23] "620" "664" "608" "621" "600" "659" "614" "647" "652" "646" "623"
## [34] "617" "641" "616" "643" "622" "660" "634" "695" "686" "692" "693"
## [45] "685" "637" "627" "610" "633" "640" "619" "671" "674" "696" "691"
## [56] "653" "683" "602" "684" "645" "665" "603" "698" "605" "657" "615"
## [67] "649" "690" "670" "697" "624"
# Blank separator line, then the number of distinct codes starting with "6".
writeLines("")
print(paste("The number of unique values: ", length(ulessThan46)))
## [1] "The number of unique values:  71"

No Match: 645,690,624

Matched: 632,683,605

Add 00 at the end of the code: 648,618,642,656,661,681,654,655,658,644,669,663,664,600,659,647,652,646,641,643,660,634,686,637,633,640,674,653,665,657,649,670

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 6 to match the Find-A-Code format

# Re-pad the 3-character codes that start with "6". The codes are split
# into four groups:
#   diagNot      - no match found; left unchanged and recorded as non-matching
#   diagNochange - already match as they are; left unchanged
#   diag00       - receive a trailing "00"
#   diag0        - every other code in ulessThan46; receives a trailing "0"
diagNot <- as.character(c(645, 690, 624))
diagNochange <- as.character(c(632, 683, 605))
diag00 <- as.character(c(
  648, 618, 642, 656, 661, 681, 654, 655, 658, 644, 669,
  663, 664, 600, 659, 647, 652, 646, 641, 643, 660, 634,
  686, 637, 633, 640, 674, 653, 665, 657, 649, 670
))

# Every code that must NOT receive a single trailing "0".
diagNot0 <- c(diagNot, diag00, diagNochange)

diag0 <- ulessThan46[!(ulessThan46 %in% diagNot0)]

# Rewrite only the diagnosis columns: comparing against the whole data frame
# would also rewrite any patient_nbr that happens to equal one of these codes.
# Both masks are computed before any value changes, so no code is padded twice.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]
  needs0 <- codes %in% diag0
  needs00 <- codes %in% diag00
  codes[needs0] <- paste0(codes[needs0], "0")
  codes[needs00] <- paste0(codes[needs00], "00")
  MyData_str[[diagCol]] <- codes
}

# Record the codes without a match.
nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "6")])
##  [1] "64800" "6820"  "6260"  "61800" "64200" "6250"  "6070"  "65600"
##  [9] "66100" "6010"  "6940"  "68100" "65400" "65500" "65800" "6040" 
## [17] "6800"  "64400" "632"   "66900" "66300" "6110"  "6200"  "66400"
## [25] "6080"  "6210"  "60000" "65900" "6140"  "64700" "65200" "64600"
## [33] "6230"  "6170"  "64100" "6160"  "64300" "6220"  "66000" "63400"
## [41] "6950"  "68600" "6920"  "6930"  "6850"  "63700" "6270"  "6100" 
## [49] "63300" "64000" "6190"  "6710"  "67400" "6960"  "6910"  "65300"
## [57] "683"   "6020"  "6840"  "645"   "66500" "6030"  "6980"  "605"  
## [65] "65700" "6150"  "64900" "690"
# Distinct diag_2 values that begin with "6" after the transformation.
with(MyData_str, unique(diag_2[startsWith(diag_2, "6")]))
##  [1] "64800" "61800" "6260"  "6820"  "64200" "66100" "6140"  "6200" 
##  [9] "6210"  "6170"  "6250"  "68100" "6270"  "60000" "6070"  "6110" 
## [17] "6160"  "6030"  "6960"  "65800" "6010"  "6040"  "64600" "6230" 
## [25] "6910"  "6800"  "6950"  "6080"  "64700" "6930"  "683"   "6840" 
## [33] "6150"  "66300" "68600" "65200" "65900" "6920"  "6220"  "6940" 
## [41] "6100"  "63400" "605"   "6020"  "64400" "6190"  "65600" "6980" 
## [49] "66400" "6850"  "645"   "65400" "67400" "64100" "67000" "64900"
## [57] "66500"
# Distinct diag_3 values that begin with "6" after the transformation.
with(MyData_str, unique(diag_3[startsWith(diag_3, "6")]))
##  [1] "6270"  "6820"  "61800" "65400" "6250"  "68100" "64800" "66400"
##  [9] "65800" "6080"  "64400" "6160"  "6010"  "66000" "65300" "6020" 
## [17] "6200"  "66300" "64200" "6980"  "6140"  "6260"  "605"   "6170" 
## [25] "6100"  "60000" "65600" "6940"  "66500" "64700" "6920"  "6070" 
## [33] "6110"  "6950"  "6230"  "65900" "65500" "67000" "6210"  "64600"
## [41] "64100" "6190"  "6960"  "65200" "6850"  "6930"  "6800"  "6970" 
## [49] "66100" "64300" "690"   "68600" "6040"  "65700" "6840"  "6030" 
## [57] "67400" "64900" "624"   "6220"  "66900" "6710"

Get the 3 digit numbers that Start with 7

# Keep the 3-character codes whose first character is "7", then list the
# distinct values.
lessThan47 <- lessThan4[substr(lessThan4, 1L, 1L) == "7"]
ulessThan47 <- unique(lessThan47)
print(ulessThan47)
##  [1] "737" "786" "722" "784" "707" "715" "730" "738" "780" "721" "787"
## [12] "724" "789" "733" "711" "728" "782" "723" "799" "794" "729" "727"
## [23] "708" "756" "781" "726" "783" "751" "716" "788" "785" "714" "710"
## [34] "736" "790" "746" "719" "791" "759" "732" "747" "718" "720" "797"
## [45] "709" "796" "792" "717" "793" "703" "745" "705" "753" "725" "734"
## [56] "735" "706" "795" "731" "700" "704" "758" "701" "752" "702" "712"
## [67] "741" "713" "755" "742" "754" "748" "750" "744" "757"
# Blank separator line, then the number of distinct codes starting with "7".
writeLines("")
print(paste("The number of unique values: ", length(ulessThan47)))
## [1] "The number of unique values:  75"

No Match: (family code) 780,787,799,790

Matched: 797,725,734

Add 00 at the end of the code: 786,707,715,730,724,789,733,711,727,716,736,746,719,718,709,741,755,744

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 7 to match the Find-A-Code format

# Re-pad the 3-character codes that start with "7". The codes are split
# into four groups:
#   diagNot      - no match found; left unchanged and recorded as non-matching
#   diagNochange - already match as they are; left unchanged
#   diag00       - receive a trailing "00"
#   diag0        - every other code in ulessThan47; receives a trailing "0"
diagNot <- as.character(c(780, 787, 799, 790))
diagNochange <- as.character(c(797, 725, 734))
diag00 <- as.character(c(
  786, 707, 715, 730, 724, 789, 733, 711, 727,
  716, 736, 746, 719, 718, 709, 741, 755, 744
))

# Every code that must NOT receive a single trailing "0".
diagNot0 <- c(diagNot, diag00, diagNochange)

diag0 <- ulessThan47[!(ulessThan47 %in% diagNot0)]

# Rewrite only the diagnosis columns: comparing against the whole data frame
# would also rewrite any patient_nbr that happens to equal one of these codes.
# Both masks are computed before any value changes, so no code is padded twice.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]
  needs0 <- codes %in% diag0
  needs00 <- codes %in% diag00
  codes[needs0] <- paste0(codes[needs0], "0")
  codes[needs00] <- paste0(codes[needs00], "00")
  MyData_str[[diagCol]] <- codes
}

# Record the codes without a match.
nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "7")])
##  [1] "7370"  "78600" "7220"  "7840"  "70700" "71500" "73000" "7380" 
##  [9] "780"   "7210"  "787"   "72400" "78900" "73300" "71100" "7280" 
## [17] "7820"  "7230"  "799"   "7940"  "7290"  "72700" "7080"  "7560" 
## [25] "7810"  "7260"  "7830"  "7510"  "71600" "7880"  "7850"  "7140" 
## [33] "7100"  "73600" "790"   "74600" "71900" "7910"  "7590"  "7320" 
## [41] "7470"  "71800" "7200"  "797"   "70900" "7960"  "7920"  "7170" 
## [49] "7930"  "7030"  "7450"  "7050"  "7530"  "725"   "734"   "7350" 
## [57] "7060"  "7950"  "7310"  "7000"  "7040"
# Distinct diag_2 values that begin with "7" after the transformation.
with(MyData_str, unique(diag_2[startsWith(diag_2, "7")]))
##  [1] "71500" "70700" "780"   "7880"  "78600" "7850"  "7290"  "7810" 
##  [9] "73300" "787"   "7100"  "7590"  "71100" "7840"  "78900" "790"  
## [17] "7210"  "7830"  "7220"  "7140"  "7230"  "72400" "73000" "7380" 
## [25] "7940"  "7310"  "799"   "71900" "7820"  "7260"  "7530"  "7280" 
## [33] "71600" "7910"  "7040"  "7580"  "7920"  "72700" "71800" "7010" 
## [41] "73600" "7050"  "7520"  "74600" "7450"  "7370"  "7020"  "70900"
## [49] "7960"  "7560"  "725"   "7170"  "7950"  "7120"  "7060"  "74100"
## [57] "7130"  "75500" "7420"  "7510"  "7540"  "734"   "7470"  "7930" 
## [65] "7480"  "797"   "7500"  "7030"
# Distinct diag_3 values that begin with "7" after the transformation.
with(MyData_str, unique(diag_3[startsWith(diag_3, "7")]))
##  [1] "7140"  "7810"  "70700" "71500" "7940"  "7840"  "72400" "73000"
##  [9] "78900" "7530"  "78600" "71100" "7370"  "7850"  "7880"  "7220" 
## [17] "787"   "780"   "7820"  "73300" "73600" "7030"  "71600" "7290" 
## [25] "799"   "790"   "7100"  "7040"  "7280"  "7310"  "7520"  "74600"
## [33] "70900" "71800" "7060"  "7260"  "7910"  "7210"  "7930"  "71900"
## [41] "734"   "7560"  "7920"  "7830"  "7050"  "7120"  "7230"  "7010" 
## [49] "7080"  "7450"  "72700" "7580"  "7510"  "7320"  "7170"  "7380" 
## [57] "7420"  "7350"  "725"   "74400" "7960"  "74100" "7590"  "7130" 
## [65] "7470"  "797"   "7540"  "75500" "7200"  "7950"  "7570"  "7020" 
## [73] "7500"

Get the 3 digit numbers that Start with 8

# Keep the 3-character codes whose first character is "8", then list the
# distinct values.
lessThan48 <- lessThan4[substr(lessThan4, 1L, 1L) == "8"]
ulessThan48 <- unique(lessThan48)
print(ulessThan48)
##  [1] "845" "808" "812" "824" "807" "820" "822" "885" "823" "852" "805"
## [12] "851" "892" "803" "821" "825" "868" "826" "850" "840" "836" "813"
## [23] "844" "861" "801" "873" "860" "802" "853" "871" "848" "810" "863"
## [34] "864" "846" "878" "882" "843" "800" "831" "839" "815" "847" "827"
## [45] "891" "814" "865" "886" "816" "835" "881" "866" "893" "897" "867"
## [56] "890" "834" "879" "883" "854" "838" "880" "870" "862" "817" "842"
## [67] "806" "837" "895" "875" "804" "832" "833" "872" "869" "811" "894"
## [78] "884" "841" "877" "876"
# Blank separator line, then the number of distinct codes starting with "8".
writeLines("")
print(paste("The number of unique values: ", length(ulessThan48)))
## [1] "The number of unique values:  81"

No Match: (none)

Matched: (none)

Add 00 at the end of the code: 845,812,807,820,823,852,805,851,803,821,868,813,861,801,853,810,864,800,831,839,815,814,865,816,835,881,866,890,834,854,838,880,842,806,804,832,833,872,811

Add 0 at the end to rest of codes

Transform the 3 digit numbers that start with 8 to match the Find-A-Code format

# Re-pad the 3-character codes that start with "8". This range has no
# "no match" and no "already matching" codes, so both groups are set to empty
# vectors explicitly; leaving them undefined here would silently reuse the
# values assigned for the previous range.
#   diag00 - receive a trailing "00"
#   diag0  - every other code in ulessThan48; receives a trailing "0"
diagNot <- character(0)
diagNochange <- character(0)
# The list follows the order of ulessThan48, where 861 sits between 813 and
# 801. The original entry in that position was 961, which is not an 8xx code,
# so it is corrected to 861.
diag00 <- as.character(c(
  845, 812, 807, 820, 823, 852, 805, 851, 803, 821, 868, 813, 861,
  801, 853, 810, 864, 800, 831, 839, 815, 814, 865, 816, 835, 881,
  866, 890, 834, 854, 838, 880, 842, 806, 804, 832, 833, 872, 811
))

# Every code that must NOT receive a single trailing "0".
diagNot0 <- c(diagNot, diag00, diagNochange)

diag0 <- ulessThan48[!(ulessThan48 %in% diagNot0)]

# Rewrite only the diagnosis columns: comparing against the whole data frame
# would also rewrite any patient_nbr that happens to equal one of these codes.
# Both masks are computed before any value changes, so no code is padded twice.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]
  needs0 <- codes %in% diag0
  needs00 <- codes %in% diag00
  codes[needs0] <- paste0(codes[needs0], "0")
  codes[needs00] <- paste0(codes[needs00], "00")
  MyData_str[[diagCol]] <- codes
}

# test the codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "8")])
##  [1] "84500" "8080"  "81200" "8240"  "80700" "82000" "8220"  "8850" 
##  [9] "82300" "85200" "80500" "85100" "8920"  "80300" "82100" "8250" 
## [17] "86800" "8260"  "8500"  "8400"  "8360"  "81300" "8440"  "8610" 
## [25] "80100" "8730"  "8600"  "8020"  "85300" "8710"  "8480"  "81000"
## [33] "8630"  "86400" "8460"  "8780"  "8820"  "8430"  "80000" "83100"
## [41] "83900" "81500" "8470"  "8270"  "8910"  "81400" "86500" "8860" 
## [49] "81600" "83500" "88100" "86600" "8930"  "8970"  "8670"  "89000"
## [57] "83400" "8790"  "8830"  "85400" "83800" "88000" "8700"  "8620" 
## [65] "8170"  "84200" "80600" "8370"  "8950"  "8750"  "80400" "83200"
## [73] "83300"
# Distinct diag_2 values that begin with "8" after the transformation.
with(MyData_str, unique(diag_2[startsWith(diag_2, "8")]))
##  [1] "8670"  "88100" "8470"  "85200" "81600" "81300" "8610"  "80500"
##  [9] "8730"  "8500"  "8600"  "8910"  "8250"  "82100" "8080"  "81200"
## [17] "86400" "80700" "82000" "8240"  "87200" "81400" "8820"  "8400" 
## [25] "8020"  "8260"  "82300" "8440"  "83300" "8690"  "8700"  "81000"
## [33] "84500" "81100" "85100" "81500" "8370"  "80100" "86800" "8920" 
## [41] "8460"  "83200" "8830"  "84200" "80000" "8360"  "86500" "83100"
## [49] "80600" "8620"  "86600" "8790"  "8220"  "8710"  "8930"  "8940" 
## [57] "85300" "88000" "8630"  "8840"  "8430"
# Distinct diag_3 values that begin with "8" after the transformation.
with(MyData_str, unique(diag_3[startsWith(diag_3, "8")]))
##  [1] "81000" "8910"  "80500" "8820"  "8360"  "8730"  "81300" "8020" 
##  [9] "88100" "8830"  "86600" "82100" "89000" "84500" "81200" "82300"
## [17] "8670"  "82000" "80700" "81600" "86500" "8400"  "8250"  "8600" 
## [25] "8260"  "8240"  "8920"  "85400" "83100" "80100" "81100" "8470" 
## [33] "84200" "8080"  "8610"  "81500" "85100" "86400" "8710"  "85200"
## [41] "8620"  "8700"  "8440"  "81400" "8500"  "87200" "8840"  "86800"
## [49] "85300" "88000" "8930"  "8630"  "80000" "8220"  "8790"  "8480" 
## [57] "83800" "8370"  "8410"  "8770"  "8750"  "83400" "8760"

Get the 3 digit numbers that Start with 9

# Keep the 3-character codes whose first character is "9", then list the
# distinct values.
lessThan49 <- lessThan4[substr(lessThan4, 1L, 1L) == "9"]
ulessThan49 <- unique(lessThan49)
print(ulessThan49)
##  [1] "999" "996" "998" "997" "995" "962" "922" "958" "969" "968" "966"
## [12] "965" "972" "924" "959" "942" "933" "977" "970" "913" "987" "911"
## [23] "989" "934" "983" "992" "967" "904" "973" "939" "991" "935" "936"
## [34] "920" "964" "917" "994" "990" "986" "945" "916" "928" "906" "980"
## [45] "955" "952" "921" "982" "914" "915" "971" "988" "963" "944" "923"
## [56] "957" "976" "974" "941" "903" "919" "975" "910" "905" "909" "918"
## [67] "947" "912" "953" "907" "908" "948" "927" "951" "943" "956" "930"
# Blank separator line, then the number of distinct codes starting with "9".
writeLines("")
print(paste("The number of unique values: ", length(ulessThan49)))
## [1] "The number of unique values:  77"

No match: 959

Matched: 936,920,986

Add 00 at the end of the code: 996,998,997,969,965,924,942,989,945,928,952,944,923,941,903,948,927,943

Add 0 at the end to the rest of the codes

Transform the 3 digit numbers that start with 9 to match the Find-A-Code format

# Re-pad the 3-character codes that start with "9". The codes are split
# into four groups:
#   diagNot      - no match found; left unchanged and recorded as non-matching
#   diagNochange - already match as they are; left unchanged
#   diag00       - receive a trailing "00"
#   diag0        - every other code in ulessThan49; receives a trailing "0"
diagNot <- as.character(c(959))
diagNochange <- as.character(c(936, 920, 986))
diag00 <- as.character(c(
  996, 998, 997, 969, 965, 924, 942, 989, 945,
  928, 952, 944, 923, 941, 903, 948, 927, 943
))

# Every code that must NOT receive a single trailing "0".
diagNot0 <- c(diagNot, diag00, diagNochange)

diag0 <- ulessThan49[!(ulessThan49 %in% diagNot0)]

# Rewrite only the diagnosis columns: comparing against the whole data frame
# would also rewrite any patient_nbr that happens to equal one of these codes.
# Both masks are computed before any value changes, so no code is padded twice.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]
  needs0 <- codes %in% diag0
  needs00 <- codes %in% diag00
  codes[needs0] <- paste0(codes[needs0], "0")
  codes[needs00] <- paste0(codes[needs00], "00")
  MyData_str[[diagCol]] <- codes
}

# Record the codes without a match.
nonMatchingCodes <- c(nonMatchingCodes, diagNot)

# testing codes
unique(MyData_str$diag_1[startsWith(MyData_str$diag_1, "9")])
##  [1] "9990"  "99600" "99800" "99700" "9950"  "9620"  "9220"  "9580" 
##  [9] "96900" "9680"  "9660"  "96500" "9720"  "92400" "959"   "94200"
## [17] "9330"  "9770"  "9700"  "9130"  "9870"  "9110"  "98900" "9340" 
## [25] "9830"  "9920"  "9670"  "9040"  "9730"  "9390"  "9910"  "9350" 
## [33] "936"   "920"   "9640"  "9170"  "9940"  "9900"  "986"   "94500"
## [41] "9160"  "92800" "9060"  "9800"  "9550"  "95200" "9210"  "9820" 
## [49] "9140"  "9150"  "9710"  "9880"  "9630"  "94400" "92300" "9570" 
## [57] "9760"  "9740"  "94100" "90300" "9190"  "9750"
# Distinct diag_2 values that begin with "9" after the transformation.
with(MyData_str, unique(diag_2[startsWith(diag_2, "9")]))
##  [1] "99800" "9990"  "99600" "99700" "9100"  "9580"  "9220"  "9330" 
##  [9] "96900" "920"   "9110"  "92400" "9050"  "9210"  "959"   "9670" 
## [17] "9090"  "9180"  "9470"  "9740"  "9120"  "92300" "9340"  "9770" 
## [25] "9150"  "9530"  "9190"  "94500" "9060"  "9620"  "9130"  "96500"
## [33] "9070"  "9080"  "9950"  "9920"  "9170"  "9160"  "9800"  "9680" 
## [41] "98900" "95200" "9900"  "9720"  "94400" "9630"  "9750"  "9940" 
## [49] "94800" "9870"  "9550"  "9910"  "94200" "92700"
# Distinct diag_3 values that begin with "9" after the transformation.
with(MyData_str, unique(diag_3[startsWith(diag_3, "9")]))
##  [1] "99600" "99800" "9990"  "99700" "9180"  "92400" "9050"  "959"  
##  [9] "920"   "9340"  "9950"  "92300" "9580"  "94500" "9070"  "9530" 
## [17] "9120"  "96500" "9210"  "9190"  "9080"  "9510"  "9620"  "96900"
## [25] "9170"  "94300" "9670"  "9090"  "9060"  "9150"  "9700"  "9220" 
## [33] "9660"  "9720"  "9160"  "9560"  "9870"  "9100"  "94400" "9910" 
## [41] "9350"  "9330"  "9920"  "9130"  "9110"  "9550"  "9300"  "98900"
## [49] "94800" "92800" "95200" "9800"  "94200" "9710"

Get the 4-character codes, excluding the E-codes

# Select the codes that are exactly four characters long, drop those whose
# first character is "E", and list the distinct values that remain.
lessThan5 <- diags[nchar(diags) == 4]
lessThan5 <- lessThan5[!startsWith(lessThan5, prefix = "E")]
print(unique(lessThan5))
## character(0)
# Blank separator line, then the number of distinct 4-character codes.
writeLines("")
print(paste("The number of unique values: ", length(unique(lessThan5))))
## [1] "The number of unique values:  0"

No 4-character codes remain once the E-codes are excluded

Get the 5-character codes (the decimal point counts as a character)

# Select the codes that are exactly five characters long and list the
# distinct values.
lessThan6 <- diags[nchar(diags) == 5]
print(unique(lessThan6))
## [1] "250.7" "250.6" "250.4" "250.8" "250.1" "250.2" "250.3" "250.5" "250.9"
# Blank separator line, then the number of distinct 5-character codes.
writeLines("")
print(paste("The number of unique values: ", length(unique(lessThan6))))
## [1] "The number of unique values:  9"

Add 0 at the end of all of these codes

Transform the 5 digit numbers to match the Find-A-Code format

# The nine 5-character "250.x" codes each receive a trailing "0".
diag0 <- c(
  "250.7", "250.6", "250.4", "250.8", "250.1",
  "250.2", "250.3", "250.5", "250.9"
)

# Rewrite only the diagnosis columns rather than comparing against every
# column of the data frame.
for (diagCol in c("diag_1", "diag_2", "diag_3")) {
  codes <- MyData_str[[diagCol]]
  needs0 <- codes %in% diag0
  codes[needs0] <- paste0(codes[needs0], "0")
  MyData_str[[diagCol]] <- codes
}

# test the codes
# startsWith() recycles a vector of prefixes position by position, so each
# entry would be tested against only one of the nine prefixes. Comparing the
# first five characters against the whole set tests every prefix for every
# entry.
unique(MyData_str$diag_1[substr(MyData_str$diag_1, 1L, 5L) %in% diag0])
##  [1] "250.40" "250.60" "250.32" "250.70" "250.13" "250.11" "250.80"
##  [8] "250.82" "250.12" "250.81" "250.83" "250.42" "250.33" "250.41"
## [15] "250.23" "250.22" "250.43" "250.92" "250.50" "250.20" "250.10"
## [22] "250.30" "250.21"
# Distinct diag_2 values whose first five characters are one of the diag0
# prefixes. startsWith() would recycle the prefix vector position by position
# and test each entry against a single prefix only.
unique(MyData_str$diag_2[substr(MyData_str$diag_2, 1L, 5L) %in% diag0])
##  [1] "250.60" "250.13" "250.83" "250.41" "250.50" "250.81" "250.11"
##  [8] "250.22" "250.70" "250.40" "250.51" "250.42" "250.53" "250.12"
## [15] "250.52" "250.91" "250.93" "250.10" "250.92" "250.82" "250.43"
## [22] "250.80" "250.32" "250.33" "250.20"
# Distinct diag_3 values whose first five characters are one of the diag0
# prefixes. startsWith() would recycle the prefix vector position by position
# and test each entry against a single prefix only.
unique(MyData_str$diag_3[substr(MyData_str$diag_3, 1L, 5L) %in% diag0])
##  [1] "250.70" "250.60" "250.43" "250.41" "250.50" "250.52" "250.82"
##  [8] "250.53" "250.40" "250.81" "250.92" "250.80" "250.51" "250.83"
## [15] "250.11" "250.42" "250.91" "250.22" "250.93" "250.12" "250.13"
## [22] "250.90"

Get the 6-character codes

# Select the codes that are exactly six characters long and list the
# distinct values.
lessThan7 <- diags[nchar(diags) == 6]
print(unique(lessThan7))
##  [1] "250.83" "250.11" "250.32" "250.13" "250.03" "250.02" "250.42"
##  [8] "250.41" "250.22" "250.82" "250.33" "250.12" "250.81" "250.93"
## [15] "250.01" "250.31" "250.43" "250.23" "250.92" "250.53" "250.21"
## [22] "250.52" "250.51" "250.91" "365.44"
# Blank separator line, then the number of distinct 6-character codes.
writeLines("")
print(paste("The number of unique values: ", length(unique(lessThan7))))
## [1] "The number of unique values:  25"

All are valid codes

Export Data

# Write the transformed data frame to CSV without a row-name column.
write.csv(
  MyData_str,
  file = "formated_diags.csv",
  row.names = FALSE
)

Invalid Codes

# Announce, then display, every code recorded as non-matching.
print("list of the codes that doesn't match with Find-A-Code:")
## [1] "list of the codes that doesn't match with Find-A-Code:"
print(nonMatchingCodes)
##  [1] "V53" "V71" "V54" "V25" "V15" "V61" "58"  "187" "284" "275" "258"
## [12] "260" "243" "350" "312" "362" "323" "444" "445" "405" "484" "584"
## [23] "558" "585" "517" "645" "690" "624" "780" "787" "799" "790" "959"
# Collect every diagnosis entry that is exactly one of the non-matching codes.
# startsWith() pairs a vector of prefixes with the entries position by
# position (recycling the shorter vector), so each entry was tested against a
# single code only and most invalid entries were missed. %in% tests each entry
# against the whole set. Because the match is exact, no length filter is
# needed afterwards, and codes shorter than three characters (such as "58")
# are kept.
Invalid_diag1 <- MyData_str$diag_1[MyData_str$diag_1 %in% nonMatchingCodes]
Invalid_diag2 <- MyData_str$diag_2[MyData_str$diag_2 %in% nonMatchingCodes]
Invalid_diag3 <- MyData_str$diag_3[MyData_str$diag_3 %in% nonMatchingCodes]

# All invalid entries, in diag_1, diag_2, diag_3 order.
InvalidDiagsEntries <- c(Invalid_diag1, Invalid_diag2, Invalid_diag3)

# Blank line, then a preview of the first 20 invalid entries.
writeLines("")
print("Print 20 entries of the invalid codes:")
## [1] "Print 20 entries of the invalid codes:"
print(head(InvalidDiagsEntries, n = 20))
##  [1] "780" "584" "780" "584" "584" "959" "780" "584" "780" "780" "787"
## [12] "585" "780" "799" "585" "780" "584" "584" "787" "780"
# Blank line, then the total number of invalid entries.
writeLines("")
print(paste("There are a ", length(InvalidDiagsEntries), " entries that are not valid"))
## [1] "There are a  794  entries that are not valid"

Download Transformed Data

# magrittr is still attached, as before, so %>% stays available afterwards.
library(magrittr)
# Read the exported CSV, join its lines with "\n", and base64-encode the
# resulting text into `encoded`.
encoded <- openssl::base64_encode(
  paste0(readLines("formated_diags.csv"), collapse = "\n")
)

NOTE:

When prompted to download, give the document a name with .csv as the file extension.

Download formated_diags.CSV